9 Association testing with simulated phenotypes

library(here)
source(here::here("book/source/04-Association_testing.R"))

9.1 Read in F2 recombination blocks

9.1.1 Read data

in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks/20211027"

in_files = list.files(in_dir, pattern = "F2_", full.names = T)

# Read into list
data_list = purrr::map(in_files, function(FILE){
  out = readr::read_tsv(FILE,
                        col_types = "ciiidii")
})
# Set names as bin length
names(data_list) = basename(in_files) %>% 
  stringr::str_split("_", simplify = T) %>% 
  subset(select = 2) %>% 
  stringr::str_remove(".txt")
# Reorder
data_list = data_list[order(as.numeric(names(data_list)))]

counter = 0
df_list = purrr::map(data_list, function(data){
  counter <<- counter + 1
  # set bin length
  bin_length = as.numeric(names(data_list)[counter])
  # add bin start and end coordinates
  df = data %>% 
    dplyr::mutate(SAMPLE = basename(sample) %>% 
                    stringr::str_remove(".txt") %>% 
                    as.numeric(.),
                  BIN_START = (bin - 1) * bin_length + 1,
                  BIN_END = bin * bin_length) %>% 
    # recode state to make 0 == "Cab"
    dplyr::mutate(STATE = dplyr::recode(state,
                                        `0` = 2,
                                        `1` = 1,
                                        `2` = 0)) %>% 
    dplyr::select(SAMPLE, CHROM = chr, BIN = bin, BIN_START, BIN_END, STATE)
  
  return(df)
})

9.1.2 How many possible blocks have at least one call?

9.1.2.1 Count total existing bins

bin_lengths = as.integer(names(df_list))
names(bin_lengths) = bin_lengths

n_bins = purrr::map(bin_lengths, function(BIN_LENGTH){
  purrr::map_int(med_chr_lens$end, function(CHR_END){
    out = seq(from = 1, to = CHR_END, by = BIN_LENGTH)
    
    return(length(out))
  })
})

n_bins_total = purrr::map_int(n_bins, sum)

9.1.2.2 Proportion of total bins with calls

# Get total number of samples
n_samples = df_list$`5000`$SAMPLE %>%
  unique(.) %>% 
  length(.)

# Build DF of bins
n_bins_df = purrr::map_dfr(1:length(df_list), function(COUNTER){
  # Bin length
  bin_length = as.numeric(names(df_list)[COUNTER])
  # Number of total bins
  n_bins = n_bins_total[COUNTER]
  # Number of bins with calls
  n_bins_with_calls = df_list[[COUNTER]] %>% 
    dplyr::distinct(CHROM, BIN_START) %>% 
    nrow(.)
  # Number of bins with calls for each 
  n_bins_no_missing = df_list[[COUNTER]] %>% 
    dplyr::count(CHROM, BIN_START) %>%
    dplyr::filter(n == n_samples) %>%
    nrow(.)
  
  # Build final data frame
  out = tibble::tibble(BIN_LENGTH = bin_length,
                       N_BINS = n_bins,
                       N_BINS_WITH_CALLS = n_bins_with_calls,
                       N_BINS_NO_MISSING = n_bins_no_missing) %>% 
    dplyr::mutate(PROP_BINS_WITH_CALLS = N_BINS_WITH_CALLS / N_BINS,
                  PROP_BINS_NO_MISSING = N_BINS_NO_MISSING / N_BINS )
  
  return(out)
})

DT::datatable(n_bins_df)

9.1.3 Merge recombination blocks

9.1.3.1 List with final recoded genotypes (including NAs)

gt_list = purrr::map(df_list, function(BIN_LENGTH_DF){
  
  # Widen data frame
  gt_final = BIN_LENGTH_DF %>% 
    tidyr::pivot_wider(names_from = SAMPLE, values_from = STATE)
  
  # Pull out matrix of genotypes
  gt_mat = as.matrix(gt_final[, 5:ncol(gt_final)])
  
  # Get indexes of loci with > 1 genotype
  bins_to_keep = logical()
  
  for (ROW in 1:nrow(gt_mat)){
    # get unique values in each row
    out = unique(gt_mat[ROW, ])
    # remove NAs
    out = out[!is.na(out)]
    # if more than one value, return TRUE
    if (length(out) > 1) {
      bins_to_keep[ROW] = TRUE
    }
    # if just one value (i.e. if all samples are the same genotype at that locus), return false 
    else {
      bins_to_keep[ROW] = FALSE
    }
  }
  
  # filter gt_final
  gt_filt = gt_final %>% 
    dplyr::filter(bins_to_keep) %>% 
    # recode genotypes to -1, 0, 1
    dplyr::mutate(dplyr::across(-c("CHROM", "BIN", "BIN_START", "BIN_END"),
                                ~dplyr::recode(.x,
                                               `0` = -1,
                                               `1` = 0,
                                               `2` = 1))) %>% 
    # order
    dplyr::arrange(CHROM, BIN_START)  
  
  return(gt_filt)
}) 

# Show first 10 columns
gt_list$`20000`[, 1:10] %>% 
  head(.) %>% 
  DT::datatable(.) 

9.1.3.2 List with final recoded genotypes (complete cases only)

gt_nomiss_list = purrr::map(gt_list, function(BIN_LENGTH_DF){
  BIN_LENGTH_DF %>%
    dplyr::filter(complete.cases(.))
})

9.2 Simulate phenotype

9.2.1 Extract samples of genotypes

# Set directory
out_dir_test = file.path(lts_dir, "association_testing/20211027_test")

These must be written to a file because PhenotypeSimulator reads delimited genotypes from files.

# Get random 10 loci
set.seed(10)
n_loci = 10
# NOTE: PhenotypeSimulator::readStandardGenotypes states that the genotype file must
# delim: a [delimter]-delimited file of [(NrSNPs+1) x (NrSamples+1)] genotypes with the snpIDs in the first column and the sampleIDs in the first row and genotypes encoded as numbers between 0 and 2 representing the (posterior) mean genotype, or dosage of the minor allele.

sample_gts = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
  
  # Pull out random SNPs
  snp_sample = gt_nomiss_list[[COUNTER]] %>% 
    dplyr::slice_sample(n = n_loci) %>% 
    dplyr::arrange(CHROM, BIN_START) %>% 
    # create locus column
    dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>% 
    # remove superfluous columns
    dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>% 
    # recode back to 0,1,2
    dplyr::mutate(dplyr::across(-LOCUS,
                                ~dplyr::recode(.x,
                                               `-1` = 0,
                                               `0` = 1,
                                               `1` = 2))) %>% 
    # reorder columns
    dplyr::select(LOCUS, everything())
  
})
names(sample_gts) = names(gt_nomiss_list)

purrr::map(seq_along(sample_gts), function(COUNTER){
  # save to file
  bin_length = names(sample_gts)[COUNTER]
  out_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
  readr::write_csv(sample_gts[[COUNTER]], out_file)
})

saveRDS(sample_gts, file.path(out_dir_test, "target_loci/sample_gts.rds"))

9.2.2 Simulate phenotype

sim_path = file.path(out_dir_test, "simulated_phenotypes/20211103_sim_phenos.rds")
set.seed(5671)
# N samples
N = n_samples
# N phenotypes
P = 1
# Proportion of total genetic variance
genVar = 0.5
# Proportion of genetic variance of genetic variant effects
h2s = 1
# Proportion of total noise variance
noiseVar = 0.5
# Proportion of noise variance of observational noise effects
phi = 1

sim_phenos = purrr::map(seq_along(sample_gts), function(COUNTER){
  # get sample file path
  bin_length = names(sample_gts)[COUNTER]
  gt_sample_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
    
  sim_pheno = PhenotypeSimulator::runSimulation(N = N, P = P, 
                                                genVar = genVar, h2s = h2s, 
                                                noiseVar = noiseVar, phi = phi,
                                                cNrSNP = n_loci,
                                                genotypefile = gt_sample_file,
                                                format = "delim",
                                                genoDelimiter = ",")
  
  return(sim_pheno)
})
names(sim_phenos) = names(sample_gts)

saveRDS(sim_phenos, sim_path)

# Write as .xlsx to use in same Snakemake code as true GWLS
lapply(seq_along(sim_phenos), function(COUNTER){
  out = tibble::tibble(fish = colnames(sample_gts[[COUNTER]])[-1],
                       Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y)
  # set path for output
  out_file = file.path(lts_dir, 
                       "association_testing/20211027_test/simulated_phenotypes",
                       paste(names(sim_phenos)[COUNTER], ".xlsx", sep = ""))
  # write to file
  openxlsx::write.xlsx(out, out_file, overwrite = T)
})

9.3 Reformat genotypes for GridLMM

9.3.1 Complete cases

final_nomiss = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
  out = list()
  
  # Genotypes
  out[["genotypes"]] = gt_nomiss_list[[COUNTER]] %>% 
    dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>% 
    # convert to matrix
    as.matrix(.) %>% 
    # transpose to put samples as rows
    t(.) %>% 
    # convert to data frame
    as.data.frame(.)
  
  # Positions
  out[["positions"]] = gt_nomiss_list[[COUNTER]] %>% 
    dplyr::select(CHROM, BIN_START, BIN_END)
  
  # Phenotypes
  out[["phenotypes"]] = data.frame(fish = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
                                   Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
                                     as.numeric()
                                   )
    
  return(out)
})
names(final_nomiss) = names(gt_nomiss_list)

9.3.2 With NAs

final_wimiss = purrr::map(seq_along(gt_list), function(COUNTER){
  out = list()
  
  # Genotypes
  out[["genotypes"]] = gt_list[[COUNTER]] %>% 
    dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>% 
    # convert to matrix
    as.matrix(.) %>% 
    # transpose to put samples as rows
    t(.) %>% 
    # convert to data frame
    as.data.frame(.)
  
  # Positions
  out[["positions"]] = gt_list[[COUNTER]] %>% 
    dplyr::select(CHROM, BIN_START, BIN_END)
  
  # Phenotypes
  out[["phenotypes"]] = data.frame(SAMPLE = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
                                   Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
                                     as.numeric()
                                   )
    
  return(out)
})
names(final_wimiss) = names(gt_list)

9.4 Test GridLMM

9.4.1 No missing

9.4.1.1 Run GWAS

test_out_nomiss = file.path(out_dir_test, "gwls_results", "gwas_results_nomiss.rds")
gwas_tests_nomiss = purrr::map(final_nomiss, function(BIN_LENGTH){
  run_gwas(d = BIN_LENGTH$genotypes,
           m = BIN_LENGTH$positions,
           p = BIN_LENGTH$phenotypes)
})

saveRDS(gwas_tests_nomiss, test_out_nomiss)

9.4.1.2 Plot

# Create custom Manhattan plot

test_man_nomiss = lapply(seq_along(gwas_tests_nomiss), function(COUNTER){
  # Get bin length
  BIN_LENGTH = names(gwas_tests_nomiss)[COUNTER] %>% 
    as.numeric(.)
  
  # Clean data frame
  test_results = gwas_tests_nomiss[[COUNTER]]$results %>% 
    dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>% 
    # add x-coord
    dplyr::mutate(X_COORD = pos + TOT) %>% 
    # change column names
    dplyr::rename(CHROM = Chr, BIN_START = pos) %>% 
    # add BIN_END
    dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>% 
    # add locus
    dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
    # target or not
    dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
                                          "yes",
                                          "no"),
                  TARGET = factor(TARGET, levels = c("yes", "no"))) %>% 
    # create vector of colours
    dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
                                            gtools::even(CHROM) ~ names(gwas_pal)[2],
                                            gtools::odd(CHROM) ~ names(gwas_pal)[3]),
                  # order so that `target` is plotted last, at the front
                  COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
                  SHAPE = dplyr::if_else(TARGET == "yes",
                                         18,
                                         20),
                  SIZE = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5),
                  ALPHA = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5)
                  )
  
  # Plot
  p1 = test_results %>% 
    ggplot(aes(x = X_COORD,
               y = -log10(p_value_REML),
               colour = COLOUR,
               shape = SHAPE,
               size = SIZE,
               alpha = ALPHA,
               label = CHROM,
               label2 = BIN_START,
               label3 = BIN_END)) + 
    geom_point() +
    aes(group = rev(TARGET)) +
    scale_color_manual(values = gwas_pal) +
    scale_shape_identity() +
    scale_size_identity() +
    scale_alpha_identity() +
    scale_x_continuous(breaks = med_chr_lens$MID_TOT, 
                       labels = med_chr_lens$chr) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank()
    ) +
    guides(colour = "none") + 
    ggtitle(paste("Bin length:", BIN_LENGTH)) +
    xlab("Chromosome") +
    ylab("-log10(p-value)") + 
    geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
    geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
  
  out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
  
  return(out)
})

9.4.2 Include loci with missing genotypes

9.4.2.1 Run GWAS

test_out_wimiss = file.path(out_dir_test, "gwls_results", "gwas_results_wimiss.rds")
gwas_tests_wimiss = purrr::map(final_wimiss, function(BIN_LENGTH){
  run_gwas(d = BIN_LENGTH$genotypes,
           m = BIN_LENGTH$positions,
           p = BIN_LENGTH$phenotypes)
})

saveRDS(gwas_tests_wimiss, test_out_wimiss)

9.4.2.2 Read in results from Snakemake script

gwas_tests_wimiss = purrr::map(bin_lengths, function(BIN_LENGTH){
  readRDS(file.path(lts_dir, "association_testing/20211104_test/test_results", paste(BIN_LENGTH, ".rds", sep = "")))
})

9.4.2.3 Plot

# Create custom Manhattan plot
gwas_pal = c("#2B2D42", "#F7B267", "#F25C54")
names(gwas_pal) = c("target", "even chr", "odd chr")
significance_line = 3.6
suggestive_line = 2.9

test_man_wimiss = purrr::map(seq_along(gwas_tests_wimiss), function(COUNTER){
  # Get bin length
  BIN_LENGTH = names(gwas_tests_wimiss)[COUNTER] %>% 
    as.numeric(.)
  
  # Clean data frame
  test_results = gwas_tests_wimiss[[COUNTER]]$results %>% 
    dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>% 
    # add x-coord
    dplyr::mutate(X_COORD = pos + TOT) %>% 
    # change column names
    dplyr::rename(CHROM = Chr, BIN_START = pos) %>% 
    # add BIN_END
    dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>% 
    # add locus
    dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
    # target or not
    dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
                                          "yes",
                                          "no"),
                  TARGET = factor(TARGET, levels = c("yes", "no"))) %>% 
    # create vector of colours
    dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
                                            gtools::even(CHROM) ~ names(gwas_pal)[2],
                                            gtools::odd(CHROM) ~ names(gwas_pal)[3]),
                  # order so that `target` is plotted last, at the front
                  COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
                  SHAPE = dplyr::if_else(TARGET == "yes",
                                         18,
                                         20),
                  SIZE = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5),
                  ALPHA = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5)
                  )
  
  # Plot
  p1 = test_results %>% 
    ggplot(aes(x = X_COORD,
               y = -log10(p_value_REML),
               colour = COLOUR,
               shape = SHAPE,
               size = SIZE,
               alpha = ALPHA,
               label = CHROM,
               label2 = BIN_START,
               label3 = BIN_END)) + 
    geom_point() +
    aes(group = rev(TARGET)) +
    scale_color_manual(values = gwas_pal) +
    scale_shape_identity() +
    scale_size_identity() +
    scale_alpha_identity() +
    scale_x_continuous(breaks = med_chr_lens$MID_TOT, 
                       labels = med_chr_lens$chr) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank()
    ) +
    guides(colour = "none") + 
    ggtitle(paste("Bin length:", BIN_LENGTH)) +
    xlab("Chromosome") +
    ylab("-log10(p-value)") + 
    geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
    geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
  
  out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
  
  return(out)

})

test_man_wimiss[[1]]
test_man_wimiss[[2]]
test_man_wimiss[[3]]
test_man_wimiss[[4]]

9.5 Get significance levels from permutations

perm_file = here::here("data/20211109_permutation_mins.csv")
PERMUTATIONS_DIR = file.path(lts_dir, "association_testing/20211109_permutations")
SITE_FILTERS = c("all_sites", "no_repeat_sites", "no_repeat_reads"); names(SITE_FILTERS) = SITE_FILTERS
TARGET_PHENOS = c("mean", "intercept"); names(TARGET_PHENOS) = TARGET_PHENOS
BIN_LENGTHS = c(5000, 10000, 15000, 20000); names(BIN_LENGTHS) = BIN_LENGTHS
N_PERMUTATIONS = 1:10

perms_df = purrr::map_dfr(SITE_FILTERS, function(SITE_FILTER){
  purrr::map_dfr(TARGET_PHENOS, function(TARGET_PHENO){
    purrr::map_dfr(BIN_LENGTHS, function(BIN_LENGTH){
      purrr::map_dfr(N_PERMUTATIONS, function(N_PERM){
        in_list = readRDS(file.path(PERMUTATIONS_DIR, SITE_FILTER, TARGET_PHENO, BIN_LENGTH, paste(N_PERM, ".rds", sep = "")))
        # Pull out minimum
        out_df = tibble::tibble(MIN_P = in_list$results$p_value_REML %>%
                                  min()
        )
      }, .id = "PERMUTATION")
    }, .id = "BIN_LENGTH")
  }, .id = "TARGET_PHENO")
}, .id = "SITE_FILTER")

# Get minimum
perms_df_mins = perms_df %>% 
  dplyr::group_by(SITE_FILTER, TARGET_PHENO, BIN_LENGTH) %>% 
  dplyr::summarise(MIN_P = min(MIN_P))

readr::write_csv(perms_df_mins,
                 perm_file)
perms_df_mins %>% 
  DT::datatable()